In [5]:
from tableauscraper import TableauScraper as TS

url = "https://visualizedata.ucop.edu/t/Public/views/TransferbyCCM/Bymajorname"

ts = TS()
ts.loads(url)
workbook = ts.getWorkbook()

for t in workbook.worksheets:
    # Show worksheet name
    print(f"worksheet name : {t.name}") 
worksheet name : Broad Disc Select Label
worksheet name : Campus Broad
worksheet name : Major Table
worksheet name : Yield and Admit
In [2]:
# Find filter values
filters = {}
for t in workbook.worksheets:
    # Show worksheet name
    print(f"name : {t.name}") 
    # Show filters
    print(t.getFilters()) 
    # Save filter settings for later use
    for f in t.getFilters():
        print(f['column'])
        filters[f['column']] = f
name : Broad Disc Select Label
[]
name : Campus Broad
[]
name : Major Table
[{'column': 'Campus', 'ordinal': 0, 'values': ['UCB', 'UCD', 'UCI', 'UCLA', 'UCM', 'UCR', 'UCSB', 'UCSC', 'UCSD'], 'globalFieldName': '[federated.1xtee380yrajk410jnsfd1fsy47h].[none:CMP_LOC_LOC1_SHRT_DESC:nk]', 'selection': ['UCB'], 'selectionAlt': [{'fn': '[federated.1xtee380yrajk410jnsfd1fsy47h].[none:CMP_LOC_LOC1_SHRT_DESC:nk]', 'columnFullNames': ['[Campus]'], 'domainTables': [{'isSelected': True, 'label': 'UCB'}]}]}, {'column': 'T Acad Yr', 'ordinal': 0, 'values': [2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021], 'globalFieldName': '[federated.1xtee380yrajk410jnsfd1fsy47h].[none:T_ACAD_YR:ok]', 'selection': [2021], 'selectionAlt': [{'fn': '[federated.1xtee380yrajk410jnsfd1fsy47h].[none:T_ACAD_YR:ok]', 'columnFullNames': ['[T Acad Yr]'], 'domainTables': [{'label': '2012'}]}]}]
Campus
T Acad Yr
name : Yield and Admit
[]
In [3]:
# Get the values for the Campus filter
values = filters['Campus']['values']
values
Out[3]:
['UCB', 'UCD', 'UCI', 'UCLA', 'UCM', 'UCR', 'UCSB', 'UCSC', 'UCSD']
In [6]:
dataYA={}
dataMT={}
dataCB={}
dataBDSL={}

# For each campus, create dataframes for each worksheet
for c in values:
    ws = ts.getWorksheet("Major Table")
    ucWB = ws.setFilter('Campus',c)
    UC = ucWB.getWorksheet('Yield and Admit')
    dataYA[c] = UC.data
    UC = ucWB.getWorksheet('Broad Disc Select Label')
    dataBDSL[c] = UC.data
    UC = ucWB.getWorksheet('Major Table')
    dataMT[c] = UC.data
    dataMT[c]['University'] = c
    UC = ucWB.getWorksheet('Campus Broad')
    dataCB[c] = UC.data
    print(c)
UCB
UCD
UCI
UCLA
UCM
UCR
UCSB
UCSC
UCSD
In [7]:
from collections import Counter

# Count the number of occurances of each major
dataMT.keys()
counter = Counter()
for i in dataMT.keys():
    counter.update(dataMT[i]['Major name-value'].tolist())
In [8]:
values_rate = []
# Find each university's admit rate
for c in values:
    values_rate.append((c, dataYA[c][dataYA[c]['Measure Names-alias'].isin(['Admit rate'])].iat[0,2]))

# Sort universities by admit rate
values_rate.sort(key=lambda x:x[1])
values_rate = [x[0] for x in values_rate]
values_rate
Out[8]:
['UCLA', 'UCB', 'UCI', 'UCSB', 'UCD', 'UCSD', 'UCSC', 'UCM', 'UCR']
In [9]:
import pandas as pd

def create_majors_df(num_colleges):
    """ 
    Create a cleaned up data set with majors that 
    exist in more than num_colleges universities.
    """
    
    # Find majors that occur at more than requested colleges
    majors = [t[0] for t in counter.items() if t[1]>num_colleges]
    
    # Concatenate data frames from all universities
    df = dataMT[values_rate[0]].copy()

    for u in values_rate[1:]:
        df = pd.concat([df, dataMT[u]], axis=0)

    # Drop duplicated columns
    df = df[df['Major name-value'].isin(majors)]
    cols = [m for m in list(df.columns) if 'alias' in m]
    df = df.drop(columns=cols)

    # Clean up column names
    df = df.rename(
        columns=lambda x: x.replace('SUM(','')
        .replace('AGG(','')
        .replace(')','')
        .replace('-value',''))
    return df
In [10]:
# Create data frame with majors that exist in more than 7 UC's
tot_majors = create_majors_df(7)

# Pivot data frame to one column per major and admit rates per campus as cell values
pivot2 = tot_majors.pivot_table(index='University', columns='Major name', values='Admit rate')
pivot2 = pivot2.reindex(values_rate)
pivot2
Out[10]:
Major name Anthropology Chemistry Computer science Economics History Philosophy Physics Sociology
University
UCLA 0.366359 0.230769 0.023535 NaN NaN 0.406593 0.367021 NaN
UCB 0.145695 0.308271 0.039669 0.221339 0.188623 0.510448 0.398907 0.177609
UCI 0.417808 0.408676 0.111836 0.325048 0.685629 0.713725 0.351724 0.428203
UCSB 0.506329 0.561247 0.096636 0.578329 0.515385 0.462633 0.630252 0.539167
UCD 0.711864 0.770270 0.284636 0.594234 0.728682 0.615385 0.538462 0.686567
UCSD NaN 0.580882 0.158818 0.581800 0.727794 0.715415 0.555556 0.625150
UCSC 0.793846 0.454054 0.316677 0.741525 0.822526 0.812834 0.709302 0.667989
UCM 0.794521 NaN NaN 0.739130 0.785714 NaN NaN 0.731183
UCR 0.804651 0.592965 0.379851 0.810219 0.816406 0.847826 0.631148 0.817017
In [11]:
# Create a second data frame with more majors
tot_majors2 = create_majors_df(3)

# Drop unused columns
cols2 = [m for m in list(tot_majors.columns) if m not in ['Major name', 'Admit rate','Applicants']]
tot_majors2 = tot_majors2.drop(columns=cols2)

# Create new column with the partial sum of the number of applicants per major
tot_majors2['Total Applicants'] = tot_majors2.groupby(['Major name']).Applicants.transform('sum')

# Sort rows by new column
tot_majors2 = tot_majors2.sort_values(by=['Total Applicants'], ascending=False)
tot_majors2
Out[11]:
Major name Applicants Admit rate Total Applicants
5 Computer science 1397 0.096636 13297
21 Computer science 1997 0.023535 13297
32 Computer science 1458 0.284636 13297
19 Computer science 1996 0.158818 13297
14 Computer science 1340 0.379851 13297
... ... ... ... ...
76 Geology 20 0.3 182
71 Geology 30 0.7 182
67 Geology 22 0.318182 182
26 Geology 83 0.46988 182
47 Geology 27 0.444444 182

180 rows × 4 columns

In [12]:
# Create data frame with majors that exist in more than 7 UC's
tot_majors3 = create_majors_df(7)

# Create a new column for the lower GPA admit range limit
tot_majors3['Lower GPA'] = tot_majors['Admit GPA range'].str[0:4].astype(float)

# Pivot data frame to one column per major and lower GPA limit per campus as cell values
pivot = tot_majors3.pivot_table(index='University', columns='Major name', values='Lower GPA')
pivot = pivot.reindex(values_rate)
pivot
Out[12]:
Major name Anthropology Chemistry Computer science Economics History Philosophy Physics Sociology
University
UCLA 3.73 3.84 3.96 NaN NaN 3.77 3.84 NaN
UCB 3.67 3.61 3.81 3.69 3.81 3.66 3.54 3.64
UCI 3.69 3.64 3.92 3.79 3.44 3.41 3.65 3.69
UCSB 3.53 3.59 3.97 3.52 3.55 3.54 3.54 3.52
UCD 3.32 3.26 3.79 3.56 3.39 3.48 3.54 3.48
UCSD NaN 3.49 3.92 3.57 3.29 3.30 3.47 3.47
UCSC 3.05 3.16 3.43 3.15 3.05 3.07 3.29 3.06
UCM 3.02 NaN NaN 3.16 2.99 NaN NaN 3.08
UCR 2.98 3.02 3.51 3.07 3.00 3.05 3.03 3.02
In [13]:
import plotly.graph_objects as go

def drop_down_plot(df, title):
    """
    Create an interactive visualization with 
    a drop down menu to select filter.
    Inspired by: Shinichi Okada
    https://towardsdatascience.com/how-to-create-an-interactive-dropdown-in-jupyter-322277f58a68
    """
    
    fig = go.Figure()
    
    # Add points to plot and trace
    for column in df.columns.to_list():
        fig.add_trace(
            go.Scatter(
                x = df.index,
                y = df[column],
                name = column
            )
        )    
    
    # Create the 'All' button that makes all columns visible 
    # by setting 'visible' to True for all columns
    button_all = dict(label = 'All',
                      method = 'update',
                      args = [{'visible': df.columns.isin(df.columns),
                               'title': 'All',
                               'showlegend':True}])

    def create_layout_button(column):
        """
        Create a button for the provided column which will
        make the data associated with the column visible.
        The name property of the plot lines is used to 
        toggle visibility.
        """
        
        return dict(label = column,
                    method = 'update',
                    args = [{'visible': df.columns.isin([column]),
                             'title': column,
                             'showlegend': True}])

    # Create drop down menu and add the 'All'button 
    # and one button for each column in the data frame
    fig.update_layout(
        updatemenus=[go.layout.Updatemenu(
            active = 0,
            buttons = [button_all] + list(df.columns.map(lambda column: create_layout_button(column)))
            )
        ]     
    )
    # Update remaining layout properties
    fig.update_layout(
        title_text=title,
        height=800
        
    )
   
    fig.show()
In [15]:
import seaborn as sns

sns.set(rc={'figure.figsize':(15,12)})

# Plot major admit rates with majors sorted by 
# total number of applicants from low to high
sns.boxplot(data=tot_majors2, x='Admit rate', y='Major name').set(title='Admit Rates of Common Majors')
print('')

In [16]:
# Interactive plot of admit rate by major with 
# UC's sorted by acceptance rate from low to high
drop_down_plot(pivot2, title="Admit Rates Per Campus of Most Common Majors") 
In [17]:
# Interactive plot of lower GPA admit limit per major 
# with UC's sorted by acceptance rate from low to high
drop_down_plot(pivot, title="Lower GPA Admit Limit of Most Common Majors by Campus") 
In [18]:
from scipy import stats

# Test for significance of major with admit rates
group1 = pivot2.iloc[:, 0].dropna().values.tolist()
group2 = pivot2.iloc[:, 1].dropna().values.tolist()
group3 = pivot2.iloc[:, 2].dropna().values.tolist()
group4 = pivot2.iloc[:, 3].dropna().values.tolist()
group5 = pivot2.iloc[:, 4].dropna().values.tolist()
group6 = pivot2.iloc[:, 5].dropna().values.tolist()
group7 = pivot2.iloc[:, 6].dropna().values.tolist()
group8 = pivot2.iloc[:, 7].dropna().values.tolist()

stats.kruskal(group1,group2,group3,group4,group5,group6,group7,group8)

# Since p < 0.05 reject null hypothesis and conclude significant 
# difference between majors in terms of acceptance rate
Out[18]:
KruskalResult(statistic=21.64038461538462, pvalue=0.0029294091653327367)
In [19]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
In [20]:
tot_majors['Admit rate'] = tot_majors['Admit rate'].astype(float)

# Perform Tukey HSD test to test for pairwise significance
tukey = pairwise_tukeyhsd(endog=tot_majors['Admit rate'],
                          groups=tot_majors['Major name'],
                          alpha=0.05)

# Convert results to a pandas data frame
tukey_df = pd.DataFrame(data=tukey._results_table.data[1:], columns=tukey._results_table.data[0])
tukey_df.rename(columns = {'p-adj':'p_adj'}, inplace = True)
tukey_df

# Conclude statistically significant difference between the means of the admit 
# rates for the major pair if p<0.05, indicated also by True in 'reject' column
Out[20]:
group1 group2 meandiff p_adj lower upper reject
0 Anthropology Chemistry -0.0993 0.9582 -0.3881 0.1896 False
1 Anthropology Computer science -0.3912 0.0028 -0.6884 -0.0939 True
2 Anthropology Economics 0.0063 1.0000 -0.2909 0.3035 False
3 Anthropology History 0.0912 0.9775 -0.2060 0.3884 False
4 Anthropology Philosophy 0.0680 0.9961 -0.2293 0.3652 False
5 Anthropology Physics -0.0448 0.9997 -0.3421 0.2524 False
6 Anthropology Sociology 0.0165 1.0000 -0.2808 0.3137 False
7 Chemistry Computer science -0.2919 0.0459 -0.5808 -0.0031 True
8 Chemistry Economics 0.1056 0.9424 -0.1833 0.3944 False
9 Chemistry History 0.1905 0.4433 -0.0984 0.4793 False
10 Chemistry Philosophy 0.1672 0.6086 -0.1216 0.4561 False
11 Chemistry Physics 0.0544 0.9988 -0.2344 0.3433 False
12 Chemistry Sociology 0.1157 0.9093 -0.1731 0.4046 False
13 Computer science Economics 0.3975 0.0022 0.1003 0.6947 True
14 Computer science History 0.4824 0.0001 0.1852 0.7796 True
15 Computer science Philosophy 0.4592 0.0002 0.1619 0.7564 True
16 Computer science Physics 0.3463 0.0119 0.0491 0.6436 True
17 Computer science Sociology 0.4077 0.0016 0.1104 0.7049 True
18 Economics History 0.0849 0.9850 -0.2123 0.3821 False
19 Economics Philosophy 0.0617 0.9979 -0.2356 0.3589 False
20 Economics Physics -0.0512 0.9994 -0.3484 0.2461 False
21 Economics Sociology 0.0102 1.0000 -0.2871 0.3074 False
22 History Philosophy -0.0232 1.0000 -0.3205 0.2740 False
23 History Physics -0.1360 0.8346 -0.4333 0.1612 False
24 History Sociology -0.0747 0.9930 -0.3720 0.2225 False
25 Philosophy Physics -0.1128 0.9304 -0.4100 0.1844 False
26 Philosophy Sociology -0.0515 0.9993 -0.3487 0.2457 False
27 Physics Sociology 0.0613 0.9979 -0.2359 0.3585 False
In [21]:
# Create interactive plot of Tukey HSD test

# DATAFRAME HEADING FOR DOCS
fig = go.Figure(data=[go.Table(
    header=dict(values=list(tukey_df.columns),
                fill_color='cornflowerblue',
                align='left'),
    cells=dict(values=[tukey_df.group1, 
                       tukey_df.group2, 
                       tukey_df.meandiff, 
                       tukey_df.p_adj,
                       tukey_df.lower, 
                       tukey_df.upper, 
                       tukey_df.reject],
               fill_color='ivory',
               align='left'))
])
fig.update_layout(
    title_text = "Tukey Table for Comparisons of Admit Rate Per Major",
    title_font_size=30,
    font_family="Times New Roman",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
)
fig.update_traces(cells_font=dict(size = 12))
fig.show()
In [ ]: